# -*- coding:utf-8 -*-

import os
import pandas as pd
# import compress_file

def traverse_directory(directory_path, output_dir="D:\\Code\\subject-word-extraction\\data\\output"):
    """Bundle the text files of each selected sub-folder into one gzip CSV.

    Every entry of *directory_path* whose name contains "2015" and does not
    contain "1999" is treated as a folder of text files. Each file in such a
    folder (except those whose name contains "使用说明" or "数据汇总") is read
    with ``process_file`` and becomes one row with the columns
    ``stock_code``, ``year``, ``corp_name`` and ``file_content``. The first
    three values are the first three ``_``-separated fields of the file name.
    The rows of one folder are written to ``<output_dir>/<folder name>.csv.gz``.

    NOTE(review): the "2015"/"1999" name filter is hard-coded; presumably it
    selects a single year's folder -- confirm before reusing for other years.

    Args:
        directory_path: Folder whose entries are scanned.
        output_dir: Folder that receives the ``.csv.gz`` files; it is created
            if it does not exist. Defaults to the path the script originally
            hard-coded.
    """
    columns = ["stock_code", "year", "corp_name", "file_content"]

    for item in os.listdir(directory_path):
        if "1999" in item or "2015" not in item:
            continue
        item_path = os.path.join(directory_path, item)
        # A stray regular file with a matching name would make listdir() raise.
        if not os.path.isdir(item_path):
            continue
        print(item)

        rows = []
        for file_item in os.listdir(item_path):
            if "使用说明" in file_item or "数据汇总" in file_item:
                continue
            full_path = os.path.join(item_path, file_item)
            # Nested folders cannot be opened as text files; ignore them.
            if not os.path.isfile(full_path):
                continue
            fields = file_item.split("_")
            if len(fields) < 3:
                # Report and skip instead of aborting the whole run on one
                # unexpectedly named file.
                print("跳过文件名格式不符的文件：{}".format(full_path))
                continue
            stock_code, year, corp_name = fields[:3]
            rows.append((stock_code, year, corp_name, process_file(full_path)))

        # Passing `columns` keeps the CSV header even when no file qualified.
        df = pd.DataFrame(rows, columns=columns)
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, "{}.csv.gz".format(item))
        df.to_csv(output_path, index=False, compression='gzip')
        print("{}保存成功！".format(item))


def process_file(file_path):
    """Return the entire content of the file at *file_path* as a string.

    The file is opened in text mode and decoded as UTF-8.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        content = handle.read()
    return content
        

if __name__ == "__main__":
    # Script entry point: scan the hard-coded reports folder.
    traverse_directory("D:\\Code\\subject-word-extraction\\data\\年度报告")